library(GEOquery)
library(crayon)
library(tidyverse)
library(Seurat)
library(SeuratData)
library(Azimuth)
library(patchwork)
library(Matrix)
library(RCurl)
library(DoubletFinder)
library(scales)
library(SoupX)
library(cowplot)
library(metap)
library(SingleCellExperiment)
library(DropletUtils)
library(AnnotationHub)
library(HGNChelper)
library(ensembldb)
library(multtest)
library(glmGamPoi)
library(pbapply)
library(data.table)
library(ggrepel)
library(ggpubr)
library(stringr)
library(canvasXpress)
library(clinfun)
library(GGally)
library(factoextra)
library(DESeq2)
library(limma)
library(fgsea)
library(org.Mm.eg.db)
library(kableExtra)
# Specify path to supplementary files.
supp_file_path = "~/Documents/Work/UnityHealth/Data_Projects/scRNAseq_LungFibrosisAnalysis/GSE264162/"
# Get unique sample names.
sample_names = list.files(path = paste0(supp_file_path, "raw_data"), pattern = "GSM", full.names = FALSE)
sample_names
## [1] "GSM8213185_LG_AA_S1_1" "GSM8213185_LG_AA_S1_2" "GSM8213186_LG_AA_S2_1"
## [4] "GSM8213186_LG_AA_S2_2" "GSM8213187_LG_AA_S3_1" "GSM8213187_LG_AA_S3_2"
## [7] "GSM8213188_LG_AA_S4_1" "GSM8213188_LG_AA_S4_2" "GSM8213189_LG_AA_S5_1"
## [10] "GSM8213189_LG_AA_S5_2" "GSM8213190_LG_AA_S6_1" "GSM8213190_LG_AA_S6_2"
## [13] "GSM8213191_LG_AA_S7_1" "GSM8213191_LG_AA_S7_2"
# Create Seurat object for each sample.
for (file in sample_names){
seurat_data <- Read10X(data.dir = paste0(supp_file_path, "raw_data/", file))
seurat_obj <- CreateSeuratObject(counts = seurat_data,
min.features = 100,
project = file)
assign(file, seurat_obj)
}
# Remove unneeded objects from the environment.
rm(file, seurat_data, seurat_obj)
# Merge Seurat objects.
merged_seurat <- merge(x = GSM8213185_LG_AA_S1_1,
y = c(GSM8213185_LG_AA_S1_2,
GSM8213186_LG_AA_S2_1,
GSM8213186_LG_AA_S2_2,
GSM8213187_LG_AA_S3_1,
GSM8213187_LG_AA_S3_2,
GSM8213188_LG_AA_S4_1,
GSM8213188_LG_AA_S4_2,
GSM8213189_LG_AA_S5_1,
GSM8213189_LG_AA_S5_2,
GSM8213190_LG_AA_S6_1,
GSM8213190_LG_AA_S6_2,
GSM8213191_LG_AA_S7_1,
GSM8213191_LG_AA_S7_2),
add.cell.id = c("Young_Sham_1_rep1",
"Young_Sham_1_rep2",
"Young_Sham_2_rep1",
"Young_Sham_2_rep2",
"Young_Sham_3_rep1",
"Young_Sham_3_rep2",
"Bleo_14_days_1_rep1",
"Bleo_14_days_1_rep2",
"Bleo_14_days_2_rep1",
"Bleo_14_days_2_rep2",
"Bleo_35_days_1_rep1",
"Bleo_35_days_1_rep2",
"Bleo_35_days_2_rep1",
"Bleo_35_days_2_rep2"))
# Concatenate the count matrices of the samples together.
merged_seurat <- JoinLayers(merged_seurat)
# Remove individual Seurat objects after successful merge.
rm(list = ls(pattern = "GSM"))
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 15829460 845.4 25595414 1367.0 NA 25595414 1367.0
## Vcells 333236570 2542.4 1571576192 11990.2 1024000 1894557522 14454.4
# Pull GEO metadata.
GSE264162_meta <- getGEO(GEO = "GSE264162", GSEMatrix = TRUE)
## Found 1 file(s)
## GSE264162_series_matrix.txt.gz
GSE264162_meta <- pData(GSE264162_meta$GSE264162_series_matrix.txt.gz)
# Subset metadata.
GSE264162_meta <- GSE264162_meta %>% subset(select = c(title,
geo_accession,
organism_ch1,
characteristics_ch1.5,
`age:ch1`,
`cell type:ch1`,
`genotype:ch1`,
`strain:ch1`,
`tissue:ch1`))
# Rename columns to more accessible names.
colnames(GSE264162_meta) <- c("title",
"geo_accession",
"organism",
"treatment",
"age",
"cell_type",
"genotype",
"strain",
"tissue")
# Edit the metadata columns.
GSE264162_meta$title <- gsub(" ", "_", GSE264162_meta$title)
GSE264162_meta$title <- gsub("-", "_", GSE264162_meta$title)
GSE264162_meta$treatment <- gsub("treatment: ", "", GSE264162_meta$treatment)
# Add timepoint variable.
GSE264162_meta$timepoint <- ifelse(GSE264162_meta$treatment == "Sham", "Sham", "")
GSE264162_meta$timepoint <- ifelse(GSE264162_meta$timepoint == "", paste0("Day ",
gsub("Bleo_", "",
gsub("_days_1", "",
gsub("_days_2", "",
GSE264162_meta$title)))),
GSE264162_meta$timepoint)
We can merged the metadata from GEO and put the merged metadata back into the Seurat object.
# Get metadata from Seurat object.
metadata = merged_seurat@meta.data %>% rownames_to_column()
dim(metadata)
## [1] 86347 4
# Create a variable column to merge dataframes.
metadata$geo_accession <- str_extract(metadata$orig.ident, "[^_]+")
# Merge metadata with GEO metadata by the newly created geo_accession column.
metadata <- merge(metadata, GSE264162_meta, by = "geo_accession")
dim(metadata)
## [1] 86347 14
# Edit and retain only required columns.
metadata <- metadata %>% column_to_rownames() %>% subset(select = -geo_accession)
# Generate quality metrics.
names(metadata)[names(metadata) == "nCount_RNA"] <- "nUMI"
names(metadata)[names(metadata) == "nFeature_RNA"] <- "nGene"
# Compute mitochondrial ratio.
metadata$mitoRatio = PercentageFeatureSet(object = merged_seurat, pattern = "^mt-")
metadata$mitoRatio = metadata$mitoRatio / 100
# Calculate novelty score.
metadata$log10GenesPerUMI <- log10(metadata$nGene) / log10(metadata$nUMI)
# Add the new metadata back to the Seurat object.
merged_seurat@meta.data <- metadata
# Remove unneeded objects.
rm(GSE264162_meta, metadata, sample_names)
Check cell counts with number of unique barcodes detected to determine capture efficiency.
# Extract metadata from Seurat object.
metadata = merged_seurat@meta.data
# Visualize cell counts.
metadata %>%
ggplot(aes(x = treatment, fill = treatment)) +
geom_bar() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
ggtitle("NCells")
The UMI counts per cell should generally be above 500, that is the low end of what we expect. If UMI counts are between 500-1000 counts, it is usable but the cells probably should have been sequenced more deeply.
# Visualize UMI counts per cell.
metadata %>%
ggplot(aes(x = nUMI, color = treatment, fill = treatment)) +
geom_density(alpha = 0.2) +
scale_x_log10(labels = label_number()) +
ylab("Cell density") +
geom_vline(xintercept = 500, linetype = "dashed", color = "red") +
theme_classic()
Generally, we expect the novelty score to be above 0.80 for good quality cells.
# Visualize novelty score.
metadata %>%
ggplot(aes(x = log10GenesPerUMI, color = treatment, fill = treatment)) +
geom_density(alpha = 0.2) +
geom_vline(xintercept = 0.8, linetype = "dashed", color = "red") +
theme_classic()
In general, poor quality samples have mitochondrial ratio higher than 0.2 mark.
# Visualize mitochondrial ratio.
metadata %>%
ggplot(aes(x = mitoRatio, color = treatment, fill = treatment)) +
geom_density(alpha = 0.2) +
scale_x_log10() +
geom_vline(xintercept = 0.2, linetype = "dashed", color = "red") +
theme_classic()
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## Warning: Removed 532 rows containing non-finite outside the scale range
## (`stat_density()`).
Considering any of these QC metrics in isolation can lead to misinterpretation of cellular signals. Jointly visualizing the count and gene thresholds and additionally overlaying the mitochondrial fraction, gives a summarized persepective of the quality per cell.
# Visualize joint filtering QCs.
metadata %>%
ggplot(aes(x = nUMI, y = nGene, color = mitoRatio)) +
geom_point() +
scale_colour_gradient(low = "gray90", high = "black") +
stat_smooth(method = lm) +
scale_x_log10() +
scale_y_log10() +
geom_vline(xintercept = 500) +
geom_hline(yintercept = 250) +
theme_classic() +
facet_wrap(~treatment)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
# Rank expression barcodes.
br.out = barcodeRanks(merged_seurat@assays$RNA$counts)
# Plot knee plot.
plot(br.out$rank, br.out$total, log = "xy", xlab = "Rank", ylab = "Total")
abline(h = metadata(br.out)$knee, col = "dodgerblue", lty = 2)
abline(h = metadata(br.out)$inflection, col = "forestgreen", lty = 2)
legend("bottomleft", lty = 2, col = c("dodgerblue", "forestgreen"), legend = c("knee", "inflection"))
# Remove unneeded object.
rm(br.out)
Before running DoubletFinder(), we need to run a quick
Seurat processing pipeline first.
# Run quick Seurat processing pipeline.
set.seed(10)
seur_obj <- NormalizeData(merged_seurat)
seur_obj <- FindVariableFeatures(seur_obj)
seur_obj <- ScaleData(seur_obj)
seur_obj <- RunPCA(seur_obj)
seur_obj <- FindNeighbors(seur_obj)
seur_obj <- FindClusters(seur_obj)
Determine the threshold for the pANN score that corresponds to the expected number of heterotypic doublets (parameter: nEXP = expected number of heterotypic doublets).
# Calculate homotypic doublet proportion.
type.freq <- table(seur_obj@meta.data$seurat_clusters) / ncol(seur_obj)
homotypic.prop <- sum(type.freq^2)
# Calculate nEXP based on number of heterotypic doublet.
nEXP = 0.009 * (ncol(seur_obj)/1000) * (1-homotypic.prop) * ncol(seur_obj)
nEXP
# Define pN and nPC.
pN = 0.25
PCs = 25
# Determine bimodality coeffient (pK).
sweep.out <- paramSweep(seur_obj, PCs = 1:PCs, sct = FALSE)
sweep.stats <- summarizeSweep(sweep.out)
Summarize sweep output to get the ideal coefficient values.
# Summarize coefficient output.
plot(sweep.stats[, 2:3])
sweep.stats[as.numeric(as.character(sweep.stats[, 2])) < 0.075 & sweep.stats[, 3] > 0.8,]
# Clear memory.
gc()
# Remove unneeded objects.
rm(homotypic.prop, sweep.out, type.freq)
pK = 0.0005 yields consistently high scores with pN = 0.05 producing the highest score.
# Run DoubletFinder using coefficients from paramSweep.
pK = 0.0005
pN = 0.05
seur_obj <- doubletFinder(seur_obj, PCs = 1:PCs, pN = pN, pK = pK, nExp = nEXP)
# DoubletFinder adds the doublet score in the meta.data slot.
head(seur_obj@metadata)
# Grab the metadata and rename relevant columns.
doublet_meta = seur_obj@meta.data
names(doublet_meta)[grep("pANN", colnames(doublet_meta))] <- "pANN"
names(doublet_meta)[grep("DF.classifications", colnames(doublet_meta))] <- "doublet_class"
# Keep relevant columns only.
doublet_meta <- doublet_meta %>%
subset(select = c("orig.ident", "pANN", "doublet_class")) %>%
rownames_to_column("matched_id")
# Get metadata from the Seurat object to merge.
metadata = merged_seurat@meta.data %>%
rownames_to_column("matched_id")
# Check if matched_id for both dataframes match before merging.
identical(metadata$matched_id, doublet_meta$matched_id) # Must be TRUE.
# Merge both metadata together.
metadata <- left_join(metadata, doublet_meta, by = "matched_id") %>%
subset(select = -orig.ident.y) %>%
column_to_rownames("matched_id")
# Add merged metadata back to the Seurat object.
all(colnames(merged_seurat) == rownames(metadata)) # Must be TRUE.
merged_seurat@meta.data <- metadata
Now that we have visualized the various metrics, we can decide on the thresholds to apply which will result in the removal of low quality cells. We will use the following thresholds:
nUMI > 500
nGene > 250
log10GenesPerUMI > 0.8
mitoRatio < 0.2
# Get pre-filtering cell counts.
dim(merged_seurat)[2] # 86,347 cells
## [1] 86347
# Low-quality cell filtering.
umi_filtered <- subset(x = merged_seurat, subset = nUMI >= 500)
gene_filtered <- subset(x = umi_filtered, subset = nGene >= 250)
genesperumi_filtered <- subset(x = gene_filtered, subset = log10GenesPerUMI > 0.80)
mitoratio_filtered <- subset(x = genesperumi_filtered, subset = mitoRatio < 0.20)
# Get post-filtering Seurat object.
# Enter level of filtering desired here.
cell_filtered_seurat = mitoratio_filtered
# Get counts after low-quality cell filtering.
dim(cell_filtered_seurat)[2] # 80,505 cells.
## [1] 80505
We can visualize the amount of cells removed and the number of cells remaining for each filtering step in the plot below.
We also remove genes that have zero expression in all cells and genes that are only expressed in 10 or less cells as genes that are expressed in only a handful of cells are not meaningful and can bring down the averages for all the other cells that they are not expressed in.
# Extract count matrix.
counts <- GetAssayData(object = cell_filtered_seurat, layer = "counts")
# Get a logical vector for every gene that has more than zero counts per cell.
nonzero <- counts > 0
# Get a logical vector of genes that are expressed in 10 or more cells.
keep_genes <- Matrix::rowSums(nonzero) >= 10
# Subset counts to keep genes that are non-zero and expressed in 10 or more cells.
filtered_counts <- counts[keep_genes, ]
# Create new Seurat object with the subsetted counts.
filtered_seurat <- CreateSeuratObject(filtered_counts, meta.data = cell_filtered_seurat@meta.data)
We can visualize the amount of genes removed and the number of genes remaining for each filtering step in the plot below.
Let’s have a look of our expression distribution before normalization.
# Visualize expression distribution before normalization.
hist(colSums(filtered_seurat[["RNA"]]$counts),
breaks = 100,
col = "#212226",
main = "Pre-Normalization: Expression distribution",
xlab = "Sum of expression")
Before we make any comparisons across cells, we will apply a simple normalization. This is solely for the purpose of exploring the sources of variation in our data.
# Apply simple normalization to merged Seurat objects.
seurat_phase <- NormalizeData(filtered_seurat)
## Normalizing layer: counts
# Visualize expression distribution after simple normalization.
hist(colSums(seurat_phase[["RNA"]]$data),
breaks = 100,
col = "#b8520a",
main = "Post-Simple Normalization: Expression distribution",
xlab = "Sum of expression")
# Remove unneeded objects.
rm(filtered_seurat)
# Download cell cycle genes for organism at https://github.com/hbc/tinyatlas/tree/master/cell_cycle.
cc_file <- getURL("https://raw.githubusercontent.com/hbc/tinyatlas/master/cell_cycle/Mus_musculus.csv")
cell_cycle_genes <- read.csv(text = cc_file)
# Remove unneeded objects.
rm(cc_file)
All of the cell cycle genes are Ensembl IDs, but our gene IDs are the gene names. To score the genes in our count matrix for cell cycle, we need to obtain the gene names for the cell cycle genes.
# Connect to AnnotationHub.
ah <- AnnotationHub()
# Access the Ensembl database for organism.
ahDb <- query(ah,
pattern = c("Mus musculus", "EnsDb"),
ignore.case = TRUE)
# Acquire the latest annotation files.
id <- ahDb %>%
mcols() %>%
rownames() %>%
tail(n = 1)
# Download the appropriate Ensembldb database.
edb <- ah[[id]]
## loading from cache
# Extract gene-level information from database.
annotations <- genes(edb,
return.type = "data.frame")
# Select annotations of interest.
annotations <- annotations %>%
dplyr::select(gene_id, gene_name, seq_name, gene_biotype, description)
# Remove unneeded objects.
rm(ah, ahDb, edb, id)
We can merge the Ensembl genes to the cell cycle genes.
# Get gene names for Ensembl IDs for each gene.
cell_cycle_markers <- dplyr::left_join(cell_cycle_genes, annotations, by = c("geneID" = "gene_id"))
# Acquire the S phase genes.
s_genes <- cell_cycle_markers %>%
dplyr::filter(phase == "S") %>%
pull("gene_name")
# Acquire the G2M phase genes.
g2m_genes <- cell_cycle_markers %>%
dplyr::filter(phase == "G2/M") %>%
pull("gene_name")
# Remove unneeded objects.
rm(cell_cycle_genes, cell_cycle_markers, annotations)
We can then use the annotated cell cycle genes to perform cell cycle scoring.
# Perform cell cycle scoring
seurat_phase <- CellCycleScoring(seurat_phase,
g2m.features = g2m_genes,
s.features = s_genes)
# Remove unneeded objects.
rm(g2m_genes, s_genes)
# Identify the most variable genes.
seurat_phase <- FindVariableFeatures(seurat_phase,
selection.method = "vst",
nfeatures = 2000,
verbose = FALSE)
# Scale the counts.
seurat_phase <- ScaleData(seurat_phase)
## Centering and scaling data matrix
# Visualize expression distribution after scaling.
hist(colSums(seurat_phase[["RNA"]]$scale.data),
breaks = 100,
col = "#779e7f",
main = "Post-Scaling: Expression distribution",
xlab = "Sum of expression")
# Identify the 15 most highly variable genes.
ranked_variable_genes <- VariableFeatures(seurat_phase)
top_genes <- ranked_variable_genes[1:15]
# Plot the average expression and variance of these genes with labels to indicate which genes are in the top 15.
p <- VariableFeaturePlot(seurat_phase)
LabelPoints(plot = p, points = top_genes, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
# Remove unneeded objects.
rm(ranked_variable_genes, top_genes, p)
# Perform PCA.
seurat_phase <- RunPCA(seurat_phase)
## PC_ 1
## Positive: Ccdc153, Tmem212, 1110017D15Rik, Fam183b, Rsph1, Dynlrb2, Foxj1, Cfap126, Cyp2s1, Arhgdig
## Sec14l3, 1700007K13Rik, Cd24a, Ccdc113, Mlf1, Gm19935, Riiad1, 1700016K19Rik, Aldh3b1, 1700094D03Rik
## Pifo, Tspan1, BC051019, Cfap53, Cdhr3, Mns1, Tctex1d4, Sntn, Ak7, Traf3ip1
## Negative: Sparc, Vim, Igfbp7, Col4a1, Bgn, Mgp, Apoe, Col6a1, Adamts1, Col3a1
## Col1a2, Fstl1, Sparcl1, Col1a1, Lgals1, Eln, Ltbp4, Igfbp4, Fbn1, Serpinh1
## Col5a2, Col4a2, Selenom, Cfh, Loxl1, Serpina3n, Ptgis, Lrp1, Emp3, Colec12
## PC_ 2
## Positive: Sftpa1, Napsa, Cxcl15, Sftpd, Slc34a2, Lyz2, Lamp3, Cd74, Scd1, S100g
## Ager, Cldn18, Fabp5, H2-Aa, Chil1, Hc, Egfl6, Aqp5, Elovl1, Ctsh
## H2-Ab1, Bex4, Lrp2, Acoxl, Slc39a8, Chia1, Mal, Lcn2, Wfdc2, Bex2
## Negative: Col3a1, Col1a2, Serping1, Col1a1, Bgn, Mgp, Gsn, Loxl1, Lrp1, Lgals1
## Fbn1, Rbp1, Plxdc2, Mfap5, Mfap4, Pmepa1, Col5a2, Fn1, Mmp2, Fstl1
## Adamts2, Rarres2, Olfml3, Sod3, Sparcl1, Dpt, Col6a3, Colec12, Ptgis, Cfh
## PC_ 3
## Positive: Sftpd, Napsa, Cxcl15, Chil1, Slc34a2, Scd1, Sftpa1, Lamp3, Lyz2, S100g
## Cldn18, Ager, Egfl6, Hc, Fabp5, Aqp5, Selenbp1, Mgst1, Elovl1, Ctsh
## Bex4, Wfdc2, Lrp2, Hp, Cd74, Col6a1, Acoxl, Prdx6, Apoc1, Il33
## Negative: Ramp2, Cdh5, Cldn5, Egfl7, Calcrl, Pecam1, Ctla2a, Ptprb, Aqp1, Ehd4
## Rasip1, Ecscr, Flt1, Eng, Icam2, Plvap, S1pr1, Srgn, Cd93, Ly6a
## Tspan7, Cyyr1, Podxl, Lyve1, Cavin2, Slfn5, Gimap6, Acvrl1, Cemip2, Clec14a
## PC_ 4
## Positive: Inmt, Scn7a, Fmo2, Ogn, Ly6a, Tmem100, Pcolce2, Clec3b, Fmo1, Hpgd
## Dpep1, Plpp3, Mfap4, Acvrl1, C7, Mamdc2, Clec14a, Tspan7, Flt1, Sept4
## Tcf21, Atp1a2, Apoe, Ace, Iigp1, Gsn, Thbd, Hsd11b1, Gstm1, Abca8a
## Negative: Birc5, Mki67, Tpx2, Cdca8, Cdk1, Pclaf, Ccna2, Ube2c, Cdca3, Hmmr
## Top2a, Cenpf, Racgap1, Prc1, Cenpe, Anln, Ccnb1, Cenpa, Pbk, Plk1
## Ccnb2, Cdc20, Aurkb, Nusap1, Ckap2l, Knl1, Kif11, Knstrn, Cdkn3, Tacc3
## PC_ 5
## Positive: Fabp5, Chil1, Lyz2, Slc34a2, Lamp3, Cxcl15, Sftpa1, S100g, Hc, Scd1
## Cbr2, Acoxl, Lcn2, Il33, Egfl6, Lrg1, Bex2, H2-Aa, Napsa, Apoc1
## Elovl1, Itih4, Lrp2, Hp, Sftpd, Chia1, Bex4, H2-Ab1, Trf, Aox3
## Negative: Rtkn2, Col4a3, Akap5, Spock2, Scnn1g, Col4a4, Igfbp2, Krt7, Itgb6, Hopx
## Ndnf, Abca5, Flrt3, Clic5, Tmem37, Ano1, 2210011C24Rik, Gprc5a, Lama3, Cdkn2b
## Cryab, Emp2, Lamc2, Hck, Pxdc1, Pdpn, Nt5e, Tspan8, Fam189a2, Scnn1b
# Plot the PCA colored by cell cycle phase.
DimPlot(seurat_phase,
reduction = "pca",
group.by = "Phase",
split.by = "Phase")
# Split effects of PCA by treatment.
DimPlot(seurat_phase,
reduction = "pca",
group.by = "Phase",
split.by = "timepoint")
We don’t see large differences due to the effects of cell cycle, so cell cycle effects are not regressed out.
# Free up memory.
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 16322719 871.8 25595414 1367.0 NA 25595414 1367.0
## Vcells 776961035 5927.8 1515293976 11560.8 1024000 1894557522 14454.4
# Set memory limit.
options(future.globals.maxSize = 8000 * 1024^2)
# Perform SCTransform normalization.
seurat_sct <- SCTransform(seurat_phase,
vars.to.regress = c("mitoRatio"),
vst.flavor = "v2",
conserve.memory = TRUE,
verbose = FALSE)
## Will not return corrected UMI because residual type is not set to 'pearson'
# Check which objects are stored in which assays.
seurat_sct@assays
## $RNA
## Assay (v5) data with 22546 features for 80505 cells
## Top 10 variable features:
## Bpifa1, Scgb3a1, Bpifb1, Ltf, Reg3g, Cxcl13, Spp1, Retnla, Tff2, Saa3
## Layers:
## counts, data, scale.data
##
## $SCT
## SCTAssay data with 22546 features for 80505 cells, and 1 SCTModel(s)
## Top 10 variable features:
## Sftpc, Scgb1a1, Lyz1, Spp1, Mgp, Ccl21a, Scgb3a2, Lyz2, Bpifa1, Scgb3a1
# Remove unneeded objects.
rm(seurat_phase)
The SCTransform normalized counts are stored in the SCT$counts slot. Here is the distribution of the SCTransform normalized counts distribution.
# Visualize expression distribution after scaling.
hist(colSums(seurat_sct[["SCT"]]$counts),
breaks = 100,
col = "#332f42",
main = "Post-SCTransform: Expression distribution",
xlab = "Sum of expression")
The log-normalized SCTransfromed counts are also stored in the SCT$data slot. Here is the distribution of the log-normalized SCTransformed counts.
hist(colSums(seurat_sct[["SCT"]]$data),
breaks = 100,
col = "#b14b34",
main = "Post-SCTransformed: Log-normalized expression distribution",
xlab = "Sum of expression")
The goal of this step is to perform unsupervised clustering to identify groups of cells based on similarities of their transcriptomes without any prior knowledge of the labels or the number of clusters. This can be challenging due to the high level of noise and large number of dimensions, so it is often beneficial to apply a dimensionality reduction method to reduce the amount of noise.
# Run PCA.
seurat_sct <- RunPCA(seurat_sct, verbose = FALSE)
# Visualize dimension reduction from PCA.
seurat_sct <- RunUMAP(seurat_sct, dims = 1:40, reduction = "pca")
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 11:38:05 UMAP embedding parameters a = 0.9922 b = 1.112
## 11:38:05 Read 80505 rows and found 40 numeric columns
## 11:38:05 Using Annoy for neighbor search, n_neighbors = 30
## 11:38:05 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 11:38:12 Writing NN index file to temp file /var/folders/1c/sbnd3yvd7mzg5lng1jcr70tw0000gn/T//RtmpG4X7bv/fileaad2c52b519
## 11:38:12 Searching Annoy index using 1 thread, search_k = 3000
## 11:38:38 Annoy recall = 100%
## 11:38:40 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 11:38:43 Initializing from normalized Laplacian + noise (using RSpectra)
## 11:38:48 Commencing optimization for 200 epochs, with 3408926 positive edges
## 11:39:21 Optimization finished
DimPlot(object = seurat_sct, group.by = "treatment")
# Explore the top PC markers.
DimHeatmap(seurat_sct,
dims = 1:9,
cells = 500,
balanced = TRUE)
We can visualize an elbow plot to see how many PCs to use for clustering.
# Build the elbow plot.
# Determine percent of variation associated with each PC.
pct <- seurat_sct[["pca"]]@stdev / sum(seurat_sct[["pca"]]@stdev) * 100
# Calculate cumulative percents for each PC.
cumu <- cumsum(pct)
# Determine which PC exhibits cumulative percent greater than 90% and % variation associated with the PC as less than 5.
co1 <- which(cumu > 90 & pct < 5)[1]
co1
## [1] 41
# Determine the difference between variation of PC and subsequent PC.
co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
# Last point where change of % of variation is more than 0.1%.
co2
## [1] 25
# Minimum of the two calculation.
pcs <- min(co1, co2)
pcs
## [1] 25
# Create a dataframe with values.
elbowplot_df <- data.frame(pct = pct,
cumu = cumu,
rank = 1:length(pct))
# Elbow plot to visualize.
ggplot(elbowplot_df, aes(cumu, pct, label = rank, color = rank > pcs)) +
geom_text() +
geom_vline(xintercept = 90, color = "grey") +
geom_hline(yintercept = min(pct[pct > 5]), color = "grey") +
theme_bw()
# Remove unneeded objects.
rm(co1, co2, cumu, elbowplot_df, pcs, pct)
Based on the elbow plot, we will use approx. 15 PCs to generate the clusters.
The basic steps towards complete clustering involve:
Measure of similarity: How does one quantify how close two data points are?
Quality function: How does one define the clustering/partition of the points?
Algorithm: How to find the clustering whereby the quality function is optimized?
We use a graph-based community detection method where each vertex represents a cell and the weight of the edge measures similarities between two cells. Put simply, we start by building an unweighted K-nearest neighbour (k-NN) graph and add weights to obtain a shared nearest neighbour (SNN) graph. Weight can be added either by finding out the shared nodes (overlap) between two neighbours (more nodes; more similar the neighbours; more weight) or measuring the closeness of both neighbours to each other.
# Compute SNN graph.
seurat_cluster <- FindNeighbors(object = seurat_sct, dims = 1:15) # Based on elbow plot.
## Computing nearest neighbor graph
## Computing SNN
# Remove unneeded object.
rm(seurat_sct)
Next, we want to iteratively group the neighbours together with the goal of combining the cells with similar transcriptomes together. Without knowing the number of clusters a priori, we use a range of resolution based on the approximation of the recommendation where datasets of 3,000 - 5,000 cells should set the resolution between 0.4 - 1.4. We have approximately 90,000 cells, so our range will start from 1.5 - 2.5.
# Determine start and end resolution range.
start_reso_range = 0.2
end_reso_range = 1.0
range_interval = 0.2
# Find clusters given the range.
seurat_cluster <- FindClusters(object = seurat_cluster,
resolution = seq(from = start_reso_range,
to = end_reso_range,
by = range_interval))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 80505
## Number of edges: 2500763
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9649
## Number of communities: 13
## Elapsed time: 34 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 80505
## Number of edges: 2500763
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9498
## Number of communities: 23
## Elapsed time: 27 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 80505
## Number of edges: 2500763
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9379
## Number of communities: 26
## Elapsed time: 31 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 80505
## Number of edges: 2500763
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9275
## Number of communities: 32
## Elapsed time: 32 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 80505
## Number of edges: 2500763
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9193
## Number of communities: 39
## Elapsed time: 28 seconds
# Remove unneeded objects.
rm(start_reso_range, end_reso_range, range_interval)
We can look at the maximum number of clusters found at each resolution with the output below.
# Create a dummy table to contain the max cluster for each resolution.
maxclust = c()
clustoutput = seurat_cluster@meta.data %>% subset(select = grep("SCT_snn_", colnames(.)))
clustoutput[] <- lapply(clustoutput, function(x) as.numeric(as.character(x)))
# Iterate and print the maximum cluster count for each resolution.
for (i in 1:ncol(clustoutput)){
maxclust[i] = max(clustoutput[, i])
print(paste0("Max n-cluster for ",
names(clustoutput[i]), ": ",
maxclust[i]))
}
## [1] "Max n-cluster for SCT_snn_res.0.2: 12"
## [1] "Max n-cluster for SCT_snn_res.0.4: 22"
## [1] "Max n-cluster for SCT_snn_res.0.6: 25"
## [1] "Max n-cluster for SCT_snn_res.0.8: 31"
## [1] "Max n-cluster for SCT_snn_res.1: 38"
# Remove unneeded objects.
rm(clustoutput, i, maxclust)
We can visualize the difference clusterings of the resolutions side by side. In this case, we will look at these resolutions: 0.6, 0.8, 1.0.
# Select resolutions to visualize.
resolutions = c("0.6", "0.8", "1")
feature_cols = paste0("SCT_snn_res.", resolutions)
# Visualize UMAP of different resolutions.
for (reso in feature_cols) {
Idents(seurat_cluster) <- reso
p <- DimPlot(seurat_cluster,
reduction = "umap",
label = TRUE,
label.size = 3) +
ggtitle(reso) +
NoLegend()
print(p)
}
# Remove unneeded objects.
rm(feature_cols, p, reso, resolutions)
Let’s start with the exploration with the highest resolution in the range, 1.0.
# Sort the cluster labels in ascending order.
reso_columns = grep("SCT_snn_", colnames(seurat_cluster@meta.data), value = TRUE)
metadata <- seurat_cluster@meta.data
for (col in colnames(metadata)) {
if (col %in% reso_columns){
metadata[, col] <- factor(metadata[, col], levels = paste(sort(as.integer(levels(metadata[, col])))))
}
}
seurat_cluster@meta.data <- metadata
# Assign desired cluster resolution as identity of Seurat.
Idents(object = seurat_cluster) <- "SCT_snn_res.1"
# Visualize cluster with a UMAP.
DimPlot(seurat_cluster,
reduction = "umap",
label = TRUE,
label.size = 3,
repel = TRUE) +
ggtitle("SCT_snn_res.1.0") +
NoLegend()
# Remove unneeded objects.
rm(col, metadata, reso_columns)
# Extract number of cells per cluster.
n_cells <- FetchData(seurat_cluster,
vars = c("ident", "treatment")) %>%
dplyr::count(ident, treatment)
# Visualize distribution of cells by cluster in a barplot.
ggplot(n_cells, aes(x = ident, y = n, fill = treatment)) +
geom_bar(position = position_dodge(),
stat = "identity") +
geom_text(aes(label = n),
size = 2,
vjust = -0.2,
position = position_dodge(1)) +
theme_classic() +
theme(axis.text.x = element_text(size = 5))
# Visualize distribution of treatment in each cluster.
DimPlot(seurat_cluster,
label = TRUE,
split.by = "treatment",
label.size = 3,
repel = TRUE) +
theme(axis.text.x = element_text(size = 6),
axis.text.y = element_text(size = 6)) +
NoLegend()
# Remove unneeded object.
rm(n_cells)
We would like to see if there is any enrichment of mitochondria in our clusters and if we need to regress out the mitochondrial level in our data.
# Plot mitoRatio to see if any cluster is enriched in mitochondrial level.
FeaturePlot(seurat_cluster,
reduction = "umap",
features = "mitoRatio",
cols = c("#d6cfc7", "#01013f"),
pt.size = 0.5,
order = TRUE,
min.cutoff = "q10",
label = TRUE,
label.size = 3,
label.color = "#960000")
There are certain clusters enriched with mitochondria, but they are within the acceptable ratio of under 0.20.
If our cell clusters show large differences in cell cycle expression, we might want to regress out the effects of cell cycle phase.
# Explore whether clusters segregate by cell cycle phase.
DimPlot(seurat_cluster,
label = TRUE,
split.by = "Phase",
label.size = 4,
repel = TRUE) +
theme(axis.text.x = element_text(size = 6),
axis.text.y = element_text(size = 6)) +
NoLegend()
We don’t see any noticeable cell cycle phase effects in our
clusters.
We can explore if any specific PCs drive the separation of the clusters.
# Define columns in the metadata to visualize.
columns = c(paste0("PC_", 1:12), "ident", "umap_1", "umap_2")
# Extract data from Seurat object based on column names.
pc_data <- FetchData(seurat_cluster, vars = columns)
# Add cluster label to the centre of cluster on UMAP.
umap_label = FetchData(seurat_cluster,
vars = c("ident", "umap_1", "umap_2")) %>%
group_by(ident) %>%
summarise(x = mean(umap_1), y = mean(umap_2))
# Plot a UMAP plot for each of the PCs.
map(paste0("PC_", 1:12), function(pc){
ggplot(pc_data, aes(umap_1, umap_2)) +
geom_point(aes_string(color = pc), alpha = 0.7) +
scale_color_gradient(guide = "none", low = "#d6cfc7", high = "#01013f") +
geom_text(data = umap_label, aes(label = ident, x, y), color = "#960000", size = 4) +
theme_classic() +
ggtitle(pc)}) %>%
plot_grid(plotlist = .)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Remove unneeded objects.
rm(columns, pc_data, umap_label)
We can print the genes driving different PCs and correlate those metagenes with the identity of the clusters.
# Print PC metagenes.
print(seurat_cluster[["pca"]], dims = 1:12, nfeatures = 5)
## PC_ 1
## Positive: Mgp, Col3a1, Gsn, Col1a2, Col1a1
## Negative: Sftpc, Lyz2, Sftpa1, Cxcl15, Sftpb
## PC_ 2
## Positive: Mgp, Sftpc, Col3a1, Lyz2, Col1a2
## Negative: Sec14l3, Cyp2f2, AU040972, Tmem212, Aldh1a1
## PC_ 3
## Positive: Ctla2a, Vwf, Cldn5, Lyve1, Ptprb
## Negative: Mgp, Aldh1a1, Cyp2f2, Sec14l3, AU040972
## PC_ 4
## Positive: Ager, Hopx, Rtkn2, Igfbp2, S100a6
## Negative: Gsn, Dcn, Vwf, Lyz2, Sftpc
## PC_ 5
## Positive: Gsn, Inmt, Ager, Hopx, Mfap4
## Negative: Spp1, Timp1, Ccl21a, Acta2, Fn1
## PC_ 6
## Positive: Spp1, Vwf, Timp1, Acta2, Tagln
## Negative: Ccl21a, Mmrn1, Prox1, Flt4, Reln
## PC_ 7
## Positive: Dcn, Col1a2, Igfbp4, Col1a1, Pi16
## Negative: Inmt, Mfap4, Scgb1a1, Gpx3, Npnt
## PC_ 8
## Positive: Inmt, Ager, Sftpc, Hopx, Sec14l3
## Negative: Scgb1a1, Scgb3a2, Scgb3a1, Bpifa1, Cyp2f2
## PC_ 9
## Positive: Car4, Cldn5, Tmem100, Kdr, Dcn
## Negative: Vwf, Selp, Fabp4, Vcam1, Aqp1
## PC_ 10
## Positive: Acta2, Tagln, Gsn, Tpm2, Dcn
## Negative: Serpina3n, Eln, Mgp, Saa3, Fn1
## PC_ 11
## Positive: Lyz1, Cbr2, Malat1, Thbs1, Cdkn1c
## Negative: S100a6, Cd177, Wfdc2, AU040972, Cxcl17
## PC_ 12
## Positive: Lyz2, Eln, Col1a1, Scgb1a1, Col1a2
## Negative: Lyz1, S100a6, Cd177, Clu, Wfdc2
The SCTransformed data was performed only on the 3000 most variable genes. Ideally, we want to explore the identity of the clusters with all the genes in the dataset and not just the top 3000 most variable genes, so we need to set the default assay back to the RNA assay.
# Set default assay back to RNA.
DefaultAssay(seurat_cluster) <- "RNA"
We can explore canonical cell type markers of our cell types of interest and visualize their expression on our UMAP. Canonical markers are obtained from LungMAP and A molecular cell atlas of the human lung from single-cell RNA sequencing (2020) Nature
AT1 cells are defined by the cell markers: Hopx, Ager, Rtkn2, Pdpn and Clic5. We can visualize both the UMAP and Dotplot to see which cluster(s) do these markers localize.
# Visualize AT1 markers expression on dotplot.
DotPlot(seurat_cluster,
features = c("Hopx", "Ager", "Rtkn2", "Pdpn", "Clic5"),
cluster.idents = TRUE,
cols = c("#d6cfc7", "#01013f"),
dot.scale = 6,
scale.by = "radius") +
ggtitle("AT1 markers") +
theme(plot.title = element_text(hjust = 0.5))
AT2 cells are defined by the cell markers: Sftpb, Sftpc, Sftpd, Muc1, Etv5 and Lamp3.
# Visualize AT2 markers expression on dotplot.
DotPlot(seurat_cluster,
features = c("Sftpb", "Sftpc", "Sftpd", "Muc1", "Etv5", "Lamp3"),
cluster.idents = TRUE,
cols = c("#d6cfc7", "#01013f"),
dot.scale = 6,
scale.by = "radius") +
ggtitle("AT2 markers") +
theme(plot.title = element_text(hjust = 0.5))
Alveolar fibroblast cells are defined by the cell markers: Pdgfra, Tcf21, Wnt2, Mfap5, Col1a1.
Pericytes are defined by the cell markers: Pdgfrb, Trpc6, Cspg4.
# Visualize alveolar fibroblast markers expression on dotplot.
DotPlot(seurat_cluster,
features = c("Tcf21", "Wnt2", "Mfap5", "Pdgfra", "Pdgfrb", "Trpc6", "Col1a1"),
cluster.idents = TRUE,
cols = c("#d6cfc7", "#01013f"),
dot.scale = 6,
scale.by = "radius") +
ggtitle("Fibroblast markers") +
theme(plot.title = element_text(hjust = 0.5))
Myofibroblast cells are defined by the cell markers: Wt1, Fgf18, Dach2, Frem2, Eln, Acta2.
# Visualize myofibroblast markers expression on dotplot.
DotPlot(seurat_cluster,
features = c("Wt1", "Fgf18", "Dach2", "Frem2", "Acta2"),
cluster.idents = TRUE,
cols = c("#d6cfc7", "#01013f"),
dot.scale = 6,
scale.by = "radius") +
ggtitle("Myofibroblast markers") +
theme(plot.title = element_text(hjust = 0.5))
Azimuth uses a reference-based mapping with the Human Biomolecular Atlas Project, containing references for multiple human tissues.
# Check all available Azimuth references.
available_data <- AvailableData()
available_data[grep("Azimuth", available_data[, 3]), 1:3] # Use lungref.
## Dataset Version Summary
## adiposeref.SeuratData adiposeref 1.0.0 Azimuth Reference: adipose
## bonemarrowref.SeuratData bonemarrowref 1.0.0 Azimuth Reference: bonemarrow
## fetusref.SeuratData fetusref 1.0.0 Azimuth Reference: fetus
## heartref.SeuratData heartref 1.0.0 Azimuth Reference: heart
## humancortexref.SeuratData humancortexref 1.0.0 Azimuth Reference: humancortex
## kidneyref.SeuratData kidneyref 1.0.2 Azimuth Reference: kidney
## lungref.SeuratData lungref 2.0.0 Azimuth Reference: lung
## mousecortexref.SeuratData mousecortexref 1.0.0 Azimuth Reference: mousecortex
## pancreasref.SeuratData pancreasref 1.0.0 Azimuth Reference: pancreas
## pbmcref.SeuratData pbmcref 1.0.0 Azimuth Reference: pbmc
## tonsilref.SeuratData tonsilref 2.0.0 Azimuth Reference: tonsil
# Run Azimuth with lungref.
options(timeout = 1000)
seurat_annotated <- RunAzimuth(seurat_cluster, reference = "lungref", assay = "RNA")
## Warning: Overwriting miscellanous data for model
## Warning: Adding a dimensional reduction (refUMAP) without the associated assay
## being present
## detected inputs from MOUSE with id type Gene.name
## reference rownames detected HUMAN with id type Gene.name
## Found 16100 out of 22546 total inputs in conversion table
## Normalizing query using reference SCT model
## Warning: No layers found matching search pattern provided
## Warning: 889 features of the features specified were not present in both the reference query assays.
## Continuing with remaining 2111 features.
## Projecting cell embeddings
## Finding query neighbors
## Finding neighborhoods
## Finding anchors
## Found 17335 anchors
## Finding integration vectors
## Finding integration vector weights
## Predicting cell labels
## Predicting cell labels
## Predicting cell labels
## Predicting cell labels
## Predicting cell labels
## Predicting cell labels
##
## Integrating dataset 2 with reference dataset
## Finding integration vectors
## Integrating data
## Warning: Keys should be one or more alphanumeric characters followed by an
## underscore, setting key from integrated_dr_ to integrateddr_
## Computing nearest neighbors
## Running UMAP projection
## Warning in RunUMAP.default(object = neighborlist, reduction.model =
## reduction.model, : Number of neighbors between query and reference is not equal
## to the number of neighbors within reference
## 11:50:42 Read 80505 rows
## 11:50:42 Processing block 1 of 1
## 11:50:42 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 20
## 11:50:42 Initializing by weighted average of neighbor coordinates using 1 thread
## 11:50:42 Commencing optimization for 67 epochs, with 1610100 positive edges
## 11:50:49 Finished
## Warning: No assay specified, setting assay as RNA by default.
## Projecting reference PCA onto query
## Finding integration vector weights
## Projecting back the query cells into original PCA space
## Finding integration vector weights
## Computing scores:
## Finding neighbors of original query cells
## Finding neighbors of transformed query cells
## Computing query SNN
## Determining bandwidth and computing transition probabilities
## Total elapsed time: 54.9975788593292
# Remove unneeded object.
rm(available_data)
# Assign desired cluster resolution as identity of Seurat.
Idents(object = seurat_annotated) <- "predicted.ann_finest_level"
# Visualize annotated cluster with a UMAP.
DimPlot(seurat_annotated,
reduction = "umap",
group.by = "predicted.ann_finest_level",
label = TRUE,
label.size = 4,
repel = TRUE) +
ggtitle("Azimuth annotation") +
theme(plot.title = element_text(hjust = 0.5)) +
NoLegend()
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Visualize annotation prediction score.
FeaturePlot(seurat_annotated,
reduction = "umap",
features = "mapping.score",
cols = c("#01013f", "#59974d", "#fff000")) +
theme(axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8)) +
ggtitle("Annotation prediction score")
# Get metadata.
annotation_meta = seurat_annotated@meta.data
# Subset the required columns.
annotation_meta <- annotation_meta %>% subset(select = c(SCT_snn_res.1, predicted.ann_finest_level, mapping.score))
# Group the table by cluster and annotation.
annotation_meta <- annotation_meta %>%
dplyr::rename(cluster = "SCT_snn_res.1",
cluster_annotation = "predicted.ann_finest_level") %>%
group_by(cluster, cluster_annotation) %>%
summarise(mean_mapping_score = mean(mapping.score))
## `summarise()` has grouped output by 'cluster'. You can override using the
## `.groups` argument.
# Visualize the probable annotations for each cluster.
annotation_meta <- annotation_meta %>% arrange(cluster, desc(mean_mapping_score)) %>% slice_head(n = 3)
annotation_meta
## # A tibble: 107 × 3
## # Groups: cluster [39]
## cluster cluster_annotation mean_mapping_score
## <fct> <chr> <dbl>
## 1 0 AT2 0.642
## 2 1 Smooth muscle 1
## 3 1 AT2 proliferating 0.724
## 4 1 AT2 0.678
## 5 2 Club (non-nasal) 0.990
## 6 2 Lymphatic EC mature 0.938
## 7 2 Adventitial fibroblasts 0.854
## 8 3 Suprabasal 0.992
## 9 3 Pericytes 0.955
## 10 3 Smooth muscle 0.911
## # ℹ 97 more rows
# Remove unneeded object.
rm(annotation_meta)
We can cross-check the reference-based annotation with the ScType annotation for validation.
# Load in ScType functions.
source("~/Documents/Work/UnityHealth/Data_Projects/auto_detect_tissue_type_KK.R")
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/gene_sets_prepare.R")
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/sctype_score_.R")
# Set path to ScType reference database.
db_ = "https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/ScTypeDB_full.xlsx"
# Check tissue type for validation.
tissue_guess <- auto_detect_tissue_type(path_to_db_file = db_,
seuratObject = seurat_cluster,
scaled = TRUE,
assay = "SCT")
## [1] "Checking...Immune system"
## Selecting by scores
## [1] "Checking...Pancreas"
## Selecting by scores
## [1] "Checking...Liver"
## Selecting by scores
## [1] "Checking...Eye"
## Selecting by scores
## [1] "Checking...Kidney"
## Selecting by scores
## [1] "Checking...Brain"
## Selecting by scores
## [1] "Checking...Lung"
## Selecting by scores
## [1] "Checking...Adrenal"
## Selecting by scores
## [1] "Checking...Heart"
## Selecting by scores
## [1] "Checking...Intestine"
## Selecting by scores
## [1] "Checking...Muscle"
## Selecting by scores
## [1] "Checking...Placenta"
## Selecting by scores
## [1] "Checking...Spleen"
## Selecting by scores
## [1] "Checking...Stomach"
## Selecting by scores
## [1] "Checking...Thymus"
## Selecting by scores
## [1] "Checking...Hippocampus"
## Selecting by scores
# Define Seurat object.
seuobj = seurat_cluster
# Specify tissue of interest.
tissue = "Lung"
# Prepare gene sets.
gs_list <- gene_sets_prepare(db_, tissue)
# Check Seurat object version - count matrix is extracted differently between versions.
seurat_package_v5 <- isFALSE("counts" %in% names(attributes(seuobj[["RNA"]])))
print(sprintf("Seurat object %s is used", ifelse(seurat_package_v5, "v5", "v4")))
## [1] "Seurat object v5 is used"
# Extract scaled scRNAseq matrix.
scRNAseqData_scaled <- if (seurat_package_v5) as.matrix(seuobj[["RNA"]]$scale.data) else
as.matrix(seuobj[["RNA"]]@scale.data)
# Run ScType.
es.max <- sctype_score(scRNAseqData = scRNAseqData_scaled,
scaled = TRUE,
gs = gs_list$gs_positive,
gs2 = gs_list$gs_negative)
# Merge ScType results by clusters.
cL_results <- do.call("rbind",
lapply(unique(seuobj@meta.data$seurat_clusters),
function(cl){es.max.cl = sort(
rowSums(
es.max[, rownames(
seuobj@meta.data[
seuobj@meta.data$seurat_clusters == cl, ])]),
decreasing = !0)
head(data.frame(cluster = cl,
type = names(es.max.cl),
scores = es.max.cl,
ncells = sum(seuobj@meta.data$seurat_clusters == cl)), 10)
}))
sctype_scores <- cL_results %>% group_by(cluster) %>% top_n(n = 1, wt = scores)
# Set low-confident (low ScType score) clusters to "Unknown".
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] <- "Unknown"
print(sctype_scores[, 1:3])
## # A tibble: 39 × 3
## # Groups: cluster [39]
## cluster type scores
## <fct> <chr> <dbl>
## 1 8 Pulmonary alveolar type II cells 9640.
## 2 1 Pulmonary alveolar type II cells 14485.
## 3 4 Fibroblasts 20590.
## 4 15 Pulmonary alveolar type II cells 6309.
## 5 3 Endothelial cell 20232.
## 6 0 Pulmonary alveolar type II cells 18099.
## 7 25 Pulmonary alveolar type II cells 3083.
## 8 20 Fibroblasts 2710.
## 9 19 Pulmonary alveolar type I cells 13307.
## 10 28 Pulmonary alveolar type II cells 1448.
## # ℹ 29 more rows
# Add annotated cell types to the metadata.
seurat_cluster@meta.data$sctype_annotation = ""
for(j in unique(sctype_scores$cluster)){
cl_type = sctype_scores[sctype_scores$cluster==j, ]
seurat_cluster@meta.data$sctype_annotation[seurat_cluster@meta.data$seurat_clusters == j] = as.character(cl_type$type[1])
}
# Plot UMAP with ScType annotation.
DimPlot(seurat_cluster,
reduction = "umap",
label = TRUE,
label.size = 3,
repel = TRUE,
group.by = "sctype_annotation") +
NoLegend()
# Remove unneeded objects.
rm(auto_detect_tissue_type, cL_results, cl_type, db_, es.max,
gene_sets_prepare, gs_list, j, scRNAseqData_scaled, sctype_score,
sctype_scores, seuobj, seurat_package_v5, tissue, tissue_guess)
# Add metadata from seurat_annotated, which contains the Azimuth annotation and ScType annotation to seurat_cluster.
seurat_cluster@meta.data <- seurat_annotated@meta.data
# Get Ensembl annotation data.
# Connect to AnnotationHub.
ah <- AnnotationHub()
# Access the Ensembl database for organism.
ahDb <- query(ah,
pattern = c("Mus musculus", "EnsDb"),
ignore.case = TRUE)
# Acquire the latest annotation files.
id <- ahDb %>%
mcols() %>%
rownames() %>%
tail(n = 1)
# Download the appropriate Ensembldb database.
edb <- ah[[id]]
## loading from cache
# Extract gene-level information from database.
annotations <- genes(edb, return.type = "data.frame")
# Select annotations of interest.
annotations <- annotations %>%
dplyr::select(gene_id, gene_name, seq_name, gene_biotype, description)
# Remove unneeded objects.
rm(ah, ahDb, edb, id)
Based on the Azimuth annotation, let’s visualize which fibroblast expresses PDGFR-a.
# Plot the marker expression in the annotated UMAP.
Idents(seurat_cluster) <- "predicted.ann_finest_level"
pdgfra_umap <- FeaturePlot(seurat_cluster,
reduction = "umap",
features = "Pdgfra",
label = FALSE,
order = TRUE,
cols = c("#01013f", "#59974d", "#fff000")) +
ggtitle("PDGFR-a expression") +
theme(axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8))
LabelClusters(pdgfra_umap, id = "ident", color = "#e97451", fontface = "bold", repel = TRUE)
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Confirm the cluster(s) that express the marker.
DotPlot(seurat_cluster, features = "Pdgfra",
assay = "RNA", cols = c("#d6cfc7", "#01013f"))
It looks like PDGFR-a is mostly expressed by alveolar fibroblasts and peribronchial fibroblasts. Let’s see if we can find out the markers that define this fibroblast.
# Find the cell counts for pericytes in each treatment.
n_cells <- FetchData(seurat_cluster,
vars = c("predicted.ann_finest_level", "treatment")) %>%
dplyr::count(predicted.ann_finest_level, treatment) %>%
subset(., predicted.ann_finest_level %in% c("Alveolar fibroblasts", "Peribronchial fibroblasts"))
# Examine the number of cells between both treatments.
ggplot(n_cells, aes(x = predicted.ann_finest_level, y = n, fill = treatment)) +
geom_bar(position = position_dodge(), stat = "identity") +
geom_text(aes(label = n), size = 2.5, vjust = -0.2, position = position_dodge(1)) +
theme_classic() +
theme(axis.text.x = element_text(size = 8))
# Find conserved markers in the alveolar fibroblast cluster.
source("~/Documents/Work/UnityHealth/Data_Projects/technical_scripts/get_conserved_markers.R")
for (cluster in c("Alveolar fibroblasts", "Peribronchial fibroblasts")) {
PDGFRA_markers <- get_conserved_markers(seuobj = seurat_cluster,
annotations = annotations,
cluster = cluster,
expt_condition = "treatment")
assign(cluster, PDGFRA_markers)
}
## Testing group Sham: (Alveolar fibroblasts) vs (AT2, EC general capillary, Adventitial fibroblasts, AT1, Transitional Club-AT2, Multiciliated (non-nasal), EC aerocyte capillary, Basal resting, EC arterial, Lymphatic EC mature, EC venous systemic, Club (non-nasal), Peribronchial fibroblasts, EC venous pulmonary, AT2 proliferating, Suprabasal, Myofibroblasts, Fibromyocytes, Smooth muscle, Lymphatic EC differentiating, Lymphatic EC proliferating, DC2, Club (nasal), SM activated stress response, Pericytes, Goblet (nasal), Deuterosomal, Mast cells, Neuroendocrine, CD8 T cells, CD4 T cells)
## Testing group Bleomycin: (Alveolar fibroblasts) vs (EC venous systemic, Multiciliated (non-nasal), Adventitial fibroblasts, AT1, EC general capillary, Lymphatic EC differentiating, Lymphatic EC mature, AT2, Peribronchial fibroblasts, CD8 T cells, EC aerocyte capillary, Smooth muscle, EC arterial, Myofibroblasts, EC venous pulmonary, SM activated stress response, Lymphatic EC proliferating, Goblet (nasal), Basal resting, Neuroendocrine, Club (non-nasal), DC2, Transitional Club-AT2, Ionocyte, CD4 T cells, Pericytes, Club (nasal), AT2 proliferating, Fibromyocytes, Suprabasal, B cells, Non-classical monocytes, Mast cells, SMG duct, Plasma cells, Mesothelium, Alveolar macrophages, Subpleural fibroblasts, Multiciliated (nasal), Deuterosomal)
## Testing group Sham: (Peribronchial fibroblasts) vs (AT2, Alveolar fibroblasts, EC general capillary, Adventitial fibroblasts, AT1, Transitional Club-AT2, Multiciliated (non-nasal), EC aerocyte capillary, Basal resting, EC arterial, Lymphatic EC mature, EC venous systemic, Club (non-nasal), EC venous pulmonary, AT2 proliferating, Suprabasal, Myofibroblasts, Fibromyocytes, Smooth muscle, Lymphatic EC differentiating, Lymphatic EC proliferating, DC2, Club (nasal), SM activated stress response, Pericytes, Goblet (nasal), Deuterosomal, Mast cells, Neuroendocrine, CD8 T cells, CD4 T cells)
## Testing group Bleomycin: (Peribronchial fibroblasts) vs (EC venous systemic, Multiciliated (non-nasal), Adventitial fibroblasts, AT1, EC general capillary, Lymphatic EC differentiating, Lymphatic EC mature, AT2, Alveolar fibroblasts, CD8 T cells, EC aerocyte capillary, Smooth muscle, EC arterial, Myofibroblasts, EC venous pulmonary, SM activated stress response, Lymphatic EC proliferating, Goblet (nasal), Basal resting, Neuroendocrine, Club (non-nasal), DC2, Transitional Club-AT2, Ionocyte, CD4 T cells, Pericytes, Club (nasal), AT2 proliferating, Fibromyocytes, Suprabasal, B cells, Non-classical monocytes, Mast cells, SMG duct, Plasma cells, Mesothelium, Alveolar macrophages, Subpleural fibroblasts, Multiciliated (nasal), Deuterosomal)
# Extract top 20 markers in the cluster.
PDGFRA_markers <- rbind(`Alveolar fibroblasts`, `Peribronchial fibroblasts`) %>%
mutate(avg_fc = (Bleomycin_avg_log2FC + Sham_avg_log2FC)/2) %>%
group_by(cluster_id) %>%
top_n(n = 20,
wt = avg_fc)
kable(PDGFRA_markers)
| cluster_id | gene | Sham_p_val | Sham_avg_log2FC | Sham_pct.1 | Sham_pct.2 | Sham_p_val_adj | Bleomycin_p_val | Bleomycin_avg_log2FC | Bleomycin_pct.1 | Bleomycin_pct.2 | Bleomycin_p_val_adj | max_pval | minimump_p_val | description | avg_fc |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Alveolar fibroblasts | Itga8 | 0 | 5.710093 | 0.782 | 0.024 | 0 | 0 | 4.121631 | 0.619 | 0.077 | 0 | 0 | 0 | integrin alpha 8 [Source:MGI Symbol;Acc:MGI:109442] | 4.915862 |
| Alveolar fibroblasts | Col13a1 | 0 | 5.904985 | 0.609 | 0.014 | 0 | 0 | 3.928055 | 0.569 | 0.060 | 0 | 0 | 0 | collagen, type XIII, alpha 1 [Source:MGI Symbol;Acc:MGI:1277201] | 4.916520 |
| Alveolar fibroblasts | Slc38a5 | 0 | 5.791733 | 0.506 | 0.012 | 0 | 0 | 4.031316 | 0.369 | 0.033 | 0 | 0 | 0 | solute carrier family 38, member 5 [Source:MGI Symbol;Acc:MGI:2148066] | 4.911524 |
| Alveolar fibroblasts | Scube2 | 0 | 6.161223 | 0.491 | 0.015 | 0 | 0 | 4.361700 | 0.353 | 0.035 | 0 | 0 | 0 | signal peptide, CUB domain, EGF-like 2 [Source:MGI Symbol;Acc:MGI:1928765] | 5.261462 |
| Alveolar fibroblasts | Fgfr4 | 0 | 6.950079 | 0.312 | 0.004 | 0 | 0 | 4.793985 | 0.188 | 0.011 | 0 | 0 | 0 | fibroblast growth factor receptor 4 [Source:MGI Symbol;Acc:MGI:95525] | 5.872032 |
| Alveolar fibroblasts | Slc7a10 | 0 | 7.778898 | 0.249 | 0.002 | 0 | 0 | 5.858652 | 0.118 | 0.003 | 0 | 0 | 0 | solute carrier family 7 (cationic amino acid transporter, y+ system), member 10 [Source:MGI Symbol;Acc:MGI:1858261] | 6.818775 |
| Alveolar fibroblasts | Amph | 0 | 5.880662 | 0.168 | 0.004 | 0 | 0 | 4.159611 | 0.119 | 0.012 | 0 | 0 | 0 | amphiphysin [Source:MGI Symbol;Acc:MGI:103574] | 5.020136 |
| Alveolar fibroblasts | Adra1a | 0 | 6.820929 | 0.138 | 0.002 | 0 | 0 | 4.349089 | 0.112 | 0.007 | 0 | 0 | 0 | adrenergic receptor, alpha 1a [Source:MGI Symbol;Acc:MGI:104773] | 5.585009 |
| Alveolar fibroblasts | Slc22a3 | 0 | 5.987598 | 0.123 | 0.003 | 0 | 0 | 4.127269 | 0.072 | 0.005 | 0 | 0 | 0 | solute carrier family 22 (organic cation transporter), member 3 [Source:MGI Symbol;Acc:MGI:1333817] | 5.057434 |
| Alveolar fibroblasts | Adrb3 | 0 | 5.844961 | 0.117 | 0.004 | 0 | 0 | 3.920863 | 0.076 | 0.009 | 0 | 0 | 0 | adrenergic receptor, beta 3 [Source:MGI Symbol;Acc:MGI:87939] | 4.882912 |
| Alveolar fibroblasts | Colq | 0 | 7.685366 | 0.092 | 0.001 | 0 | 0 | 5.236795 | 0.078 | 0.003 | 0 | 0 | 0 | collagen like tail subunit of asymmetric acetylcholinesterase [Source:MGI Symbol;Acc:MGI:1338761] | 6.461080 |
| Alveolar fibroblasts | Cass4 | 0 | 5.857382 | 0.058 | 0.001 | 0 | 0 | 3.877901 | 0.044 | 0.006 | 0 | 0 | 0 | Cas scaffolding protein family member 4 [Source:MGI Symbol;Acc:MGI:2444482] | 4.867642 |
| Alveolar fibroblasts | Mettl21e | 0 | 6.246712 | 0.043 | 0.001 | 0 | 0 | 4.249276 | 0.027 | 0.002 | 0 | 0 | 0 | methyltransferase like 21E [Source:MGI Symbol;Acc:MGI:2685837] | 5.247994 |
| Alveolar fibroblasts | Slc13a4 | 0 | 6.813269 | 0.035 | 0.000 | 0 | 0 | 3.907303 | 0.016 | 0.002 | 0 | 0 | 0 | solute carrier family 13 (sodium/sulfate symporters), member 4 [Source:MGI Symbol;Acc:MGI:2442367] | 5.360286 |
| Alveolar fibroblasts | Gm48662 | 0 | 6.335692 | 0.037 | 0.001 | 0 | 0 | 3.498129 | 0.028 | 0.003 | 0 | 0 | 0 | predicted gene, 48662 [Source:MGI Symbol;Acc:MGI:6098277] | 4.916910 |
| Alveolar fibroblasts | Ano5 | 0 | 6.605714 | 0.033 | 0.000 | 0 | 0 | 4.295960 | 0.022 | 0.001 | 0 | 0 | 0 | anoctamin 5 [Source:MGI Symbol;Acc:MGI:3576659] | 5.450837 |
| Alveolar fibroblasts | Ms4a4c | 0 | 7.359657 | 0.023 | 0.000 | 0 | 0 | 3.621467 | 0.012 | 0.001 | 0 | 0 | 0 | membrane-spanning 4-domains, subfamily A, member 4C [Source:MGI Symbol;Acc:MGI:1927656] | 5.490562 |
| Alveolar fibroblasts | Kcng1 | 0 | 5.457614 | 0.012 | 0.000 | 0 | 0 | 4.535511 | 0.015 | 0.001 | 0 | 0 | 0 | potassium voltage-gated channel, subfamily G, member 1 [Source:MGI Symbol;Acc:MGI:3616086] | 4.996563 |
| Alveolar fibroblasts | Cntn1 | 0 | 6.079370 | 0.014 | 0.000 | 0 | 0 | 3.802363 | 0.015 | 0.002 | 0 | 0 | 0 | contactin 1 [Source:MGI Symbol;Acc:MGI:105980] | 4.940867 |
| Alveolar fibroblasts | Gm13703 | 0 | 6.468622 | 0.012 | 0.000 | 0 | 0 | 3.758446 | 0.016 | 0.002 | 0 | 0 | 0 | predicted gene 13703 [Source:MGI Symbol;Acc:MGI:3651307] | 5.113534 |
| Peribronchial fibroblasts | Eln | 0 | 4.122702 | 0.861 | 0.139 | 0 | 0 | 2.825576 | 0.915 | 0.354 | 0 | 0 | 0 | elastin [Source:MGI Symbol;Acc:MGI:95317] | 3.474140 |
| Peribronchial fibroblasts | Col6a3 | 0 | 3.986600 | 0.788 | 0.077 | 0 | 0 | 2.967634 | 0.866 | 0.199 | 0 | 0 | 0 | collagen, type VI, alpha 3 [Source:MGI Symbol;Acc:MGI:88461] | 3.477117 |
| Peribronchial fibroblasts | Cxcl14 | 0 | 4.257647 | 0.628 | 0.057 | 0 | 0 | 2.384786 | 0.776 | 0.179 | 0 | 0 | 0 | C-X-C motif chemokine ligand 14 [Source:MGI Symbol;Acc:MGI:1888514] | 3.321217 |
| Peribronchial fibroblasts | Col5a1 | 0 | 3.677700 | 0.623 | 0.062 | 0 | 0 | 2.907386 | 0.812 | 0.174 | 0 | 0 | 0 | collagen, type V, alpha 1 [Source:MGI Symbol;Acc:MGI:88457] | 3.292543 |
| Peribronchial fibroblasts | Ltbp2 | 0 | 4.583246 | 0.454 | 0.031 | 0 | 0 | 2.912186 | 0.650 | 0.107 | 0 | 0 | 0 | latent transforming growth factor beta binding protein 2 [Source:MGI Symbol;Acc:MGI:99502] | 3.747716 |
| Peribronchial fibroblasts | Lamc3 | 0 | 4.674723 | 0.284 | 0.016 | 0 | 0 | 2.043056 | 0.149 | 0.033 | 0 | 0 | 0 | laminin gamma 3 [Source:MGI Symbol;Acc:MGI:1344394] | 3.358890 |
| Peribronchial fibroblasts | D430041D05Rik | 0 | 4.960910 | 0.169 | 0.007 | 0 | 0 | 2.454889 | 0.080 | 0.013 | 0 | 0 | 0 | RIKEN cDNA D430041D05 gene [Source:MGI Symbol;Acc:MGI:2181743] | 3.707900 |
| Peribronchial fibroblasts | Ccn4 | 0 | 4.260328 | 0.177 | 0.012 | 0 | 0 | 2.546925 | 0.403 | 0.063 | 0 | 0 | 0 | cellular communication network factor 4 [Source:MGI Symbol;Acc:MGI:1197008] | 3.403627 |
| Peribronchial fibroblasts | Col5a3 | 0 | 3.895073 | 0.164 | 0.011 | 0 | 0 | 3.512139 | 0.363 | 0.048 | 0 | 0 | 0 | collagen, type V, alpha 3 [Source:MGI Symbol;Acc:MGI:1858212] | 3.703606 |
| Peribronchial fibroblasts | Col28a1 | 0 | 5.387062 | 0.069 | 0.002 | 0 | 0 | 3.256865 | 0.285 | 0.032 | 0 | 0 | 0 | collagen, type XXVIII, alpha 1 [Source:MGI Symbol;Acc:MGI:2685312] | 4.321963 |
| Peribronchial fibroblasts | Adamts12 | 0 | 4.252873 | 0.108 | 0.007 | 0 | 0 | 2.570841 | 0.255 | 0.032 | 0 | 0 | 0 | ADAM metallopeptidase with thrombospondin type 1 motif 12 [Source:MGI Symbol;Acc:MGI:2146046] | 3.411857 |
| Peribronchial fibroblasts | Gpr176 | 0 | 5.281970 | 0.026 | 0.001 | 0 | 0 | 2.582730 | 0.121 | 0.016 | 0 | 0 | 0 | G protein-coupled receptor 176 [Source:MGI Symbol;Acc:MGI:2685858] | 3.932350 |
| Peribronchial fibroblasts | Megf10 | 0 | 4.226124 | 0.037 | 0.002 | 0 | 0 | 3.283944 | 0.093 | 0.008 | 0 | 0 | 0 | multiple EGF-like-domains 10 [Source:MGI Symbol;Acc:MGI:2685177] | 3.755034 |
| Peribronchial fibroblasts | C1qtnf3 | 0 | 4.753842 | 0.162 | 0.007 | 0 | 0 | 2.134169 | 0.044 | 0.009 | 0 | 0 | 0 | C1q and tumor necrosis factor related protein 3 [Source:MGI Symbol;Acc:MGI:1932136] | 3.444005 |
| Peribronchial fibroblasts | A2m | 0 | 4.166858 | 0.017 | 0.001 | 0 | 0 | 2.732664 | 0.075 | 0.010 | 0 | 0 | 0 | alpha-2-macroglobulin [Source:MGI Symbol;Acc:MGI:2449119] | 3.449761 |
| Peribronchial fibroblasts | Col24a1 | 0 | 4.986037 | 0.015 | 0.001 | 0 | 0 | 3.029205 | 0.049 | 0.004 | 0 | 0 | 0 | collagen, type XXIV, alpha 1 [Source:MGI Symbol;Acc:MGI:1918605] | 4.007621 |
| Peribronchial fibroblasts | Tmem200a | 0 | 4.619795 | 0.052 | 0.002 | 0 | 0 | 2.306634 | 0.067 | 0.009 | 0 | 0 | 0 | transmembrane protein 200A [Source:MGI Symbol;Acc:MGI:1924470] | 3.463214 |
| Peribronchial fibroblasts | Hoxb8 | 0 | 4.778536 | 0.017 | 0.001 | 0 | 0 | 2.253017 | 0.063 | 0.010 | 0 | 0 | 0 | homeobox B8 [Source:MGI Symbol;Acc:MGI:96189] | 3.515777 |
| Peribronchial fibroblasts | Mrgprf | 0 | 4.672262 | 0.058 | 0.003 | 0 | 0 | 1.921079 | 0.035 | 0.005 | 0 | 0 | 0 | MAS-related GPR, member F [Source:MGI Symbol;Acc:MGI:2384823] | 3.296670 |
| Peribronchial fibroblasts | Gm5084 | 0 | 5.185958 | 0.020 | 0.001 | 0 | 0 | 2.519955 | 0.015 | 0.003 | 0 | 0 | 0 | predicted gene 5084 [Source:MGI Symbol;Acc:MGI:3647835] | 3.852957 |
# Remove unneeded objects.
rm(pdgfra_umap, n_cells, `Alveolar fibroblasts`, `Peribronchial fibroblasts`, cluster, PDGFRA_markers)
Based on the Azimuth annotation, let’s visualize which fibroblast expresses PDGFR-b.
# Plot the marker expression in the annotated UMAP.
Idents(seurat_cluster) <- "predicted.ann_finest_level"
pdgfrb_umap <- FeaturePlot(seurat_cluster,
reduction = "umap",
features = "Pdgfrb",
label = FALSE,
order = TRUE,
cols = c("#01013f", "#59974d", "#fff000")) +
ggtitle("PDGFR-b expression") +
theme(axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8))
LabelClusters(pdgfrb_umap, id = "ident", color = "#e97451", fontface = "bold", repel = TRUE)
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Confirm the cluster(s) that express the marker.
DotPlot(seurat_cluster, features = "Pdgfrb",
assay = "RNA", cols = c("#d6cfc7", "#01013f"))
It looks like PDGFR-b is mostly expressed by pericytes. Let’s see if we can find out the markers that define this fibroblast.
# Find the cell counts for pericytes in each treatment.
n_cells <- FetchData(seurat_cluster,
vars = c("predicted.ann_finest_level", "treatment")) %>%
dplyr::count(predicted.ann_finest_level, treatment) %>%
subset(., predicted.ann_finest_level == "Pericytes")
# Examine the number of cells between both treatments.
ggplot(n_cells, aes(x = predicted.ann_finest_level, y = n, fill = treatment)) +
geom_bar(position = position_dodge(), stat = "identity") +
geom_text(aes(label = n), size = 2.5, vjust = -0.2, position = position_dodge(1)) +
theme_classic() +
theme(axis.text.x = element_text(size = 8))
# Find conserved markers in the pericytes cluster.
source("~/Documents/Work/UnityHealth/Data_Projects/technical_scripts/get_conserved_markers.R")
pericytes_markers <- get_conserved_markers(seuobj = seurat_cluster,
annotations = annotations,
cluster = "Pericytes",
expt_condition = "treatment")
## Testing group Sham: (Pericytes) vs (AT2, Alveolar fibroblasts, EC general capillary, Adventitial fibroblasts, AT1, Transitional Club-AT2, Multiciliated (non-nasal), EC aerocyte capillary, Basal resting, EC arterial, Lymphatic EC mature, EC venous systemic, Club (non-nasal), Peribronchial fibroblasts, EC venous pulmonary, AT2 proliferating, Suprabasal, Myofibroblasts, Fibromyocytes, Smooth muscle, Lymphatic EC differentiating, Lymphatic EC proliferating, DC2, Club (nasal), SM activated stress response, Goblet (nasal), Deuterosomal, Mast cells, Neuroendocrine, CD8 T cells, CD4 T cells)
## Testing group Bleomycin: (Pericytes) vs (EC venous systemic, Multiciliated (non-nasal), Adventitial fibroblasts, AT1, EC general capillary, Lymphatic EC differentiating, Lymphatic EC mature, AT2, Peribronchial fibroblasts, Alveolar fibroblasts, CD8 T cells, EC aerocyte capillary, Smooth muscle, EC arterial, Myofibroblasts, EC venous pulmonary, SM activated stress response, Lymphatic EC proliferating, Goblet (nasal), Basal resting, Neuroendocrine, Club (non-nasal), DC2, Transitional Club-AT2, Ionocyte, CD4 T cells, Club (nasal), AT2 proliferating, Fibromyocytes, Suprabasal, B cells, Non-classical monocytes, Mast cells, SMG duct, Plasma cells, Mesothelium, Alveolar macrophages, Subpleural fibroblasts, Multiciliated (nasal), Deuterosomal)
# Extract top 20 markers in the cluster.
pericytes_markers <- pericytes_markers %>%
mutate(avg_fc = (Bleomycin_avg_log2FC + Sham_avg_log2FC)/2) %>%
group_by(cluster_id) %>%
top_n(n = 20,
wt = avg_fc)
kable(pericytes_markers)
| cluster_id | gene | Sham_p_val | Sham_avg_log2FC | Sham_pct.1 | Sham_pct.2 | Sham_p_val_adj | Bleomycin_p_val | Bleomycin_avg_log2FC | Bleomycin_pct.1 | Bleomycin_pct.2 | Bleomycin_p_val_adj | max_pval | minimump_p_val | description | avg_fc |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Pericytes | Trpc6 | 0 | 10.144589 | 0.312 | 0.000 | 0 | 0 | 8.986716 | 0.473 | 0.006 | 0 | 0 | 0 | transient receptor potential cation channel, subfamily C, member 6 [Source:MGI Symbol;Acc:MGI:109523] | 9.565653 |
| Pericytes | Higd1b | 0 | 10.102512 | 0.312 | 0.001 | 0 | 0 | 11.049704 | 0.582 | 0.001 | 0 | 0 | 0 | HIG1 domain family, member 1B [Source:MGI Symbol;Acc:MGI:1922939] | 10.576108 |
| Pericytes | Vtn | 0 | 12.722170 | 0.250 | 0.000 | 0 | 0 | 9.826647 | 0.273 | 0.001 | 0 | 0 | 0 | vitronectin [Source:MGI Symbol;Acc:MGI:98940] | 11.274409 |
| Pericytes | Fam162b | 0 | 9.215323 | 0.250 | 0.001 | 0 | 0 | 10.871935 | 0.273 | 0.000 | 0 | 0 | 0 | family with sequence similarity 162, member B [Source:MGI Symbol;Acc:MGI:1924546] | 10.043629 |
| Pericytes | Padi3 | 0 | 14.691993 | 0.125 | 0.000 | 0 | 0 | 8.370606 | 0.036 | 0.000 | 0 | 0 | 0 | peptidyl arginine deiminase, type III [Source:MGI Symbol;Acc:MGI:1338891] | 11.531300 |
| Pericytes | Cstdc2 | 0 | 11.446853 | 0.125 | 0.000 | 0 | 0 | 11.276197 | 0.182 | 0.000 | 0 | 0 | 0 | cystatin domain containing 2 [Source:MGI Symbol;Acc:MGI:1924955] | 11.361525 |
| Pericytes | Adcy8 | 0 | 10.546986 | 0.062 | 0.000 | 0 | 0 | 11.174731 | 0.218 | 0.000 | 0 | 0 | 0 | adenylate cyclase 8 [Source:MGI Symbol;Acc:MGI:1341110] | 10.860859 |
| Pericytes | Vsnl1 | 0 | 9.084571 | 0.500 | 0.003 | 0 | 0 | 6.903979 | 0.182 | 0.005 | 0 | 0 | 0 | visinin-like 1 [Source:MGI Symbol;Acc:MGI:1349453] | 7.994275 |
| Pericytes | Lef1 | 0 | 10.488495 | 0.250 | 0.001 | 0 | 0 | 6.596202 | 0.200 | 0.009 | 0 | 0 | 0 | lymphoid enhancer binding factor 1 [Source:MGI Symbol;Acc:MGI:96770] | 8.542348 |
| Pericytes | Cox4i2 | 0 | 8.255949 | 0.688 | 0.036 | 0 | 0 | 9.229187 | 0.691 | 0.021 | 0 | 0 | 0 | cytochrome c oxidase subunit 4I2 [Source:MGI Symbol;Acc:MGI:2135755] | 8.742568 |
| Pericytes | Ndufa4l2 | 0 | 8.250385 | 0.688 | 0.010 | 0 | 0 | 7.470427 | 0.636 | 0.022 | 0 | 0 | 0 | Ndufa4, mitochondrial complex associated like 2 [Source:MGI Symbol;Acc:MGI:3039567] | 7.860406 |
| Pericytes | Olfr553 | 0 | 12.274668 | 0.062 | 0.000 | 0 | 0 | 9.520471 | 0.018 | 0.000 | 0 | 0 | 0 | NA | 10.897570 |
| Pericytes | Fabp7 | 0 | 10.874978 | 0.125 | 0.000 | 0 | 0 | 6.095756 | 0.036 | 0.001 | 0 | 0 | 0 | fatty acid binding protein 7, brain [Source:MGI Symbol;Acc:MGI:101916] | 8.485367 |
| Pericytes | Kcnj12 | 0 | 10.063393 | 0.188 | 0.001 | 0 | 0 | 7.428102 | 0.073 | 0.002 | 0 | 0 | 0 | potassium inwardly-rectifying channel, subfamily J, member 12 [Source:MGI Symbol;Acc:MGI:108495] | 8.745747 |
| Pericytes | Card9 | 0 | 8.304698 | 0.062 | 0.001 | 0 | 0 | 7.716783 | 0.164 | 0.002 | 0 | 0 | 0 | caspase recruitment domain family, member 9 [Source:MGI Symbol;Acc:MGI:2685628] | 8.010741 |
| Pericytes | Gucy1b1 | 0 | 8.547001 | 0.688 | 0.016 | 0 | 0 | 7.404685 | 0.727 | 0.050 | 0 | 0 | 0 | guanylate cyclase 1, soluble, beta 1 [Source:MGI Symbol;Acc:MGI:1860604] | 7.975843 |
| Pericytes | Agtr1a | 0 | 8.316184 | 0.188 | 0.004 | 0 | 0 | 8.213101 | 0.218 | 0.006 | 0 | 0 | 0 | angiotensin II receptor, type 1a [Source:MGI Symbol;Acc:MGI:87964] | 8.264643 |
| Pericytes | Trarg1 | 0 | 8.244735 | 0.125 | 0.001 | 0 | 0 | 7.507260 | 0.091 | 0.002 | 0 | 0 | 0 | trafficking regulator of GLUT4 (SLC2A4) 1 [Source:MGI Symbol;Acc:MGI:3029307] | 7.875998 |
| Pericytes | Kcnq5 | 0 | 8.681899 | 0.125 | 0.001 | 0 | 0 | 8.037112 | 0.145 | 0.003 | 0 | 0 | 0 | potassium voltage-gated channel, subfamily Q, member 5 [Source:MGI Symbol;Acc:MGI:1924937] | 8.359505 |
| Pericytes | Tmem200a | 0 | 9.122794 | 0.250 | 0.002 | 0 | 0 | 6.651709 | 0.255 | 0.014 | 0 | 0 | 0 | transmembrane protein 200A [Source:MGI Symbol;Acc:MGI:1924470] | 7.887252 |
# Remove unneeded objects.
rm(pdgfrb_umap, n_cells, pericytes_markers)
Based on the Azimuth annotation, let’s visualize which epithelial cells express Krt5.
# Plot the marker expression in the annotated UMAP.
Idents(seurat_cluster) <- "predicted.ann_finest_level"
Krt5_umap <- FeaturePlot(seurat_cluster,
reduction = "umap",
features = "Krt5",
label = FALSE,
order = TRUE,
cols = c("#01013f", "#59974d", "#fff000")) +
ggtitle("Krt5+ expression") +
theme(axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8))
LabelClusters(Krt5_umap, id = "ident", color = "#e97451", fontface = "bold", repel = TRUE)
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Find out which clusters express the marker.
DotPlot(seurat_cluster, features = "Krt5",
assay = "RNA", cols = c("#d6cfc7", "#01013f"))
It looks like Krt5+ is mostly expressed by basal resting and club (nasal) cells. Let’s see if we can find out the markers that define these epithelial cells.
# Find the cell counts for basal and club cells in each treatment.
n_cells <- FetchData(seurat_cluster,
vars = c("predicted.ann_finest_level", "treatment")) %>%
dplyr::count(predicted.ann_finest_level, treatment) %>%
subset(., predicted.ann_finest_level %in% c("Club (nasal)"))
# Examine the number of cells between both treatments.
ggplot(n_cells, aes(x = predicted.ann_finest_level, y = n, fill = treatment)) +
geom_bar(position = position_dodge(), stat = "identity") +
geom_text(aes(label = n), size = 2.5, vjust = -0.2, position = position_dodge(1)) +
theme_classic() +
theme(axis.text.x = element_text(size = 8))
# Find conserved markers in the club (nasal) cluster.
source("~/Documents/Work/UnityHealth/Data_Projects/technical_scripts/get_conserved_markers.R")
club_markers <- get_conserved_markers(seuobj = seurat_cluster,
annotations = annotations,
cluster = "Club (nasal)",
expt_condition = "treatment")
## Testing group Sham: (Club (nasal)) vs (AT2, Alveolar fibroblasts, EC general capillary, Adventitial fibroblasts, AT1, Transitional Club-AT2, Multiciliated (non-nasal), EC aerocyte capillary, Basal resting, EC arterial, Lymphatic EC mature, EC venous systemic, Club (non-nasal), Peribronchial fibroblasts, EC venous pulmonary, AT2 proliferating, Suprabasal, Myofibroblasts, Fibromyocytes, Smooth muscle, Lymphatic EC differentiating, Lymphatic EC proliferating, DC2, SM activated stress response, Pericytes, Goblet (nasal), Deuterosomal, Mast cells, Neuroendocrine, CD8 T cells, CD4 T cells)
## Testing group Bleomycin: (Club (nasal)) vs (EC venous systemic, Multiciliated (non-nasal), Adventitial fibroblasts, AT1, EC general capillary, Lymphatic EC differentiating, Lymphatic EC mature, AT2, Peribronchial fibroblasts, Alveolar fibroblasts, CD8 T cells, EC aerocyte capillary, Smooth muscle, EC arterial, Myofibroblasts, EC venous pulmonary, SM activated stress response, Lymphatic EC proliferating, Goblet (nasal), Basal resting, Neuroendocrine, Club (non-nasal), DC2, Transitional Club-AT2, Ionocyte, CD4 T cells, Pericytes, AT2 proliferating, Fibromyocytes, Suprabasal, B cells, Non-classical monocytes, Mast cells, SMG duct, Plasma cells, Mesothelium, Alveolar macrophages, Subpleural fibroblasts, Multiciliated (nasal), Deuterosomal)
# Extract top 20 markers in the cluster.
club_markers <- head(club_markers, 20)
kable(club_markers)
| cluster_id | gene | Sham_p_val | Sham_avg_log2FC | Sham_pct.1 | Sham_pct.2 | Sham_p_val_adj | Bleomycin_p_val | Bleomycin_avg_log2FC | Bleomycin_pct.1 | Bleomycin_pct.2 | Bleomycin_p_val_adj | max_pval | minimump_p_val | description |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Club (nasal) | Ifi202b | 0 | 9.536563 | 0.8 | 0.001 | 0 | 0 | 7.426929 | 0.854 | 0.005 | 0 | 0 | 0 | interferon activated gene 202B [Source:MGI Symbol;Acc:MGI:1347083] |
| Club (nasal) | 4833423E24Rik | 0 | 9.635913 | 0.8 | 0.001 | 0 | 0 | 7.632552 | 0.812 | 0.004 | 0 | 0 | 0 | NA |
| Club (nasal) | Tns4 | 0 | 9.326516 | 0.8 | 0.001 | 0 | 0 | 5.419853 | 0.229 | 0.004 | 0 | 0 | 0 | tensin 4 [Source:MGI Symbol;Acc:MGI:2144377] |
| Club (nasal) | Dapl1 | 0 | 8.420025 | 0.8 | 0.002 | 0 | 0 | 7.203046 | 0.875 | 0.006 | 0 | 0 | 0 | death associated protein-like 1 [Source:MGI Symbol;Acc:MGI:1923997] |
| Club (nasal) | Krt17 | 0 | 9.644663 | 0.8 | 0.002 | 0 | 0 | 7.569699 | 0.792 | 0.006 | 0 | 0 | 0 | keratin 17 [Source:MGI Symbol;Acc:MGI:96691] |
| Club (nasal) | Krt5 | 0 | 9.360105 | 0.8 | 0.002 | 0 | 0 | 7.276484 | 0.792 | 0.005 | 0 | 0 | 0 | keratin 5 [Source:MGI Symbol;Acc:MGI:96702] |
| Club (nasal) | Gpr87 | 0 | 10.925382 | 0.6 | 0.000 | 0 | 0 | 7.772418 | 0.333 | 0.001 | 0 | 0 | 0 | G protein-coupled receptor 87 [Source:MGI Symbol;Acc:MGI:1934133] |
| Club (nasal) | Defb1 | 0 | 7.905752 | 0.6 | 0.001 | 0 | 0 | 6.463990 | 0.583 | 0.005 | 0 | 0 | 0 | defensin beta 1 [Source:MGI Symbol;Acc:MGI:1096878] |
| Club (nasal) | Trim29 | 0 | 8.438270 | 0.6 | 0.001 | 0 | 0 | 6.999176 | 0.542 | 0.004 | 0 | 0 | 0 | tripartite motif-containing 29 [Source:MGI Symbol;Acc:MGI:1919419] |
| Club (nasal) | Slc25a21 | 0 | 9.397660 | 0.6 | 0.001 | 0 | 0 | 6.653883 | 0.146 | 0.001 | 0 | 0 | 0 | solute carrier family 25 (mitochondrial oxodicarboxylate carrier), member 21 [Source:MGI Symbol;Acc:MGI:2445059] |
| Club (nasal) | Mdga2 | 0 | 9.436134 | 0.6 | 0.001 | 0 | 0 | 5.148588 | 0.229 | 0.004 | 0 | 0 | 0 | MAM domain containing glycosylphosphatidylinositol anchor 2 [Source:MGI Symbol;Acc:MGI:2444706] |
| Club (nasal) | Pkp1 | 0 | 10.516065 | 0.4 | 0.000 | 0 | 0 | 8.036203 | 0.438 | 0.002 | 0 | 0 | 0 | plakophilin 1 [Source:MGI Symbol;Acc:MGI:1328359] |
| Club (nasal) | Foxe1 | 0 | 8.538125 | 0.4 | 0.000 | 0 | 0 | 6.378833 | 0.229 | 0.002 | 0 | 0 | 0 | forkhead box E1 [Source:MGI Symbol;Acc:MGI:1353500] |
| Club (nasal) | Sall3 | 0 | 9.127323 | 0.4 | 0.000 | 0 | 0 | 7.131397 | 0.208 | 0.002 | 0 | 0 | 0 | spalt like transcription factor 3 [Source:MGI Symbol;Acc:MGI:109295] |
| Club (nasal) | Serpinb5 | 0 | 8.441000 | 0.4 | 0.001 | 0 | 0 | 6.108664 | 0.562 | 0.007 | 0 | 0 | 0 | serine (or cysteine) peptidase inhibitor, clade B, member 5 [Source:MGI Symbol;Acc:MGI:109579] |
| Club (nasal) | Dlk2 | 0 | 8.265797 | 0.6 | 0.001 | 0 | 0 | 6.617187 | 0.479 | 0.003 | 0 | 0 | 0 | delta like non-canonical Notch ligand 2 [Source:MGI Symbol;Acc:MGI:2146838] |
| Club (nasal) | Fmo6 | 0 | 8.337107 | 0.6 | 0.002 | 0 | 0 | 7.830383 | 0.604 | 0.003 | 0 | 0 | 0 | flavin containing monooxygenase 6 [Source:MGI Symbol;Acc:MGI:2681841] |
| Club (nasal) | Dsc3 | 0 | 10.278129 | 0.2 | 0.000 | 0 | 0 | 7.025309 | 0.208 | 0.001 | 0 | 0 | 0 | desmocollin 3 [Source:MGI Symbol;Acc:MGI:1194993] |
| Club (nasal) | Pax9 | 0 | 6.485695 | 0.8 | 0.005 | 0 | 0 | 6.282572 | 0.625 | 0.008 | 0 | 0 | 0 | paired box 9 [Source:MGI Symbol;Acc:MGI:97493] |
| Club (nasal) | Aqp3 | 0 | 8.515569 | 0.8 | 0.009 | 0 | 0 | 7.760175 | 0.917 | 0.009 | 0 | 0 | 0 | aquaporin 3 [Source:MGI Symbol;Acc:MGI:1333777] |
# Remove unneeded objects.
rm(Krt5_umap, n_cells, club_markers, get_conserved_markers, annotations)
# Remove any cells that are labelled nasal.
grep("nasal", seurat_cluster$predicted.ann_finest_level, value = TRUE) %>% unique() # Club (nasal), Goblet (nasal), Multiciliated (nasal).
## [1] "Multiciliated (non-nasal)" "Club (non-nasal)"
## [3] "Club (nasal)" "Goblet (nasal)"
## [5] "Multiciliated (nasal)"
dim(seurat_cluster)[2] # 80,505
## [1] 80505
seurat_cluster <- subset(seurat_cluster,
subset = predicted.ann_finest_level %in% c("Club (nasal)",
"Goblet (nasal)",
"Multiciliated (nasal)"),
invert = TRUE)
dim(seurat_cluster)[2] # 80,294 (211 cells removed)
## [1] 80294
# Remove any cells that are labelled mesothelial cells as they are pleural and are not specialized lung cells.
grep("Mesothelium", seurat_cluster$predicted.ann_finest_level, value = TRUE) %>% unique() # Mesothelium.
## [1] "Mesothelium"
dim(seurat_cluster)[2] # 80,294
## [1] 80294
seurat_cluster <- subset(seurat_cluster,
subset = predicted.ann_finest_level == "Mesothelium",
invert = TRUE)
dim(seurat_cluster)[2] # 80,290 (4 cells removed)
## [1] 80290
# Pull cell metadata from the Seurat object.
metadata <- seurat_cluster@meta.data
# Aggregate some annotated cell types.
# Find all fibroblasts and aggregate them together.
grep("fibroblast", metadata$predicted.ann_finest_level, value = TRUE) %>% unique()
## [1] "Alveolar fibroblasts" "Adventitial fibroblasts"
## [3] "Peribronchial fibroblasts" "Myofibroblasts"
## [5] "Subpleural fibroblasts"
metadata <- mutate(metadata,
predicted.ann_finest_level =
recode(predicted.ann_finest_level,
"Alveolar fibroblasts" = "Fibroblasts",
"Adventitial fibroblasts" = "Fibroblasts",
"Peribronchial fibroblasts" = "Fibroblasts",
"Subpleural fibroblasts" = "Fibroblasts"))
# Find all endothelial cells and aggregate them together.
grep("EC", metadata$predicted.ann_finest_level, value = TRUE) %>% unique()
## [1] "EC general capillary" "EC aerocyte capillary"
## [3] "EC arterial" "Lymphatic EC mature"
## [5] "EC venous systemic" "EC venous pulmonary"
## [7] "Lymphatic EC differentiating" "Lymphatic EC proliferating"
metadata <- mutate(metadata,
predicted.ann_finest_level =
recode(predicted.ann_finest_level,
"EC general capillary" = "Endothelial capillary",
"EC aerocyte capillary" = "Endothelial capillary",
"EC arterial" = "Endothelial arterial",
"EC venous systemic" = "Endothelial venous",
"EC venous pulmonary" = "Endothelial venous"))
metadata <- mutate(metadata,
predicted.ann_finest_level =
recode(predicted.ann_finest_level,
"Lymphatic EC mature" = "Endothelial lymphatic",
"Lymphatic EC differentiating" = "Endothelial lymphatic",
"Lymphatic EC proliferating" = "Endothelial lymphatic"))
# Find all smooth muscle cells and aggregate them together.
grep("SM|smooth muscle", metadata$predicted.ann_finest_level, value = TRUE) %>% unique()
## [1] "SM activated stress response" "SMG duct"
metadata <- mutate(metadata,
predicted.ann_finest_level =
recode(predicted.ann_finest_level,
"Smooth muscle" = "VSMCs",
"SM activated stress response" = "VSMCs"))
# Find all immune cells and aggregate them together.
metadata$predicted.ann_finest_level[metadata$predicted.ann_level_1 == "Immune"] %>% unique()
## [1] "DC2" "Mast cells"
## [3] "CD8 T cells" "Basal resting"
## [5] "CD4 T cells" "AT1"
## [7] "B cells" "Non-classical monocytes"
## [9] "Endothelial capillary" "Fibroblasts"
## [11] "Plasma cells" "Alveolar macrophages"
metadata <- mutate(metadata,
predicted.ann_finest_level =
recode(predicted.ann_finest_level,
"DC2" = "Immune",
"Mast cells" = "Immune",
"CD8 T cells" = "Immune",
"CD4 T cells" = "Immune",
"B cells" = "Immune",
"Non-classical monocytes" = "Immune",
"Plasma cells" = "Immune",
"Alveolar macrophages" = "Immune"))
# Get metadata column and rename.
metadata = (metadata %>%
subset(select = -cell_type) %>%
rename("title" = "mice_id",
"predicted.ann_finest_level" = "cell_type"))
# Add edited metadata back to Seurat object.
seurat_cluster@meta.data <- metadata
# Factorize the timepoint according to sequential order.
seurat_cluster$timepoint <- factor(x = seurat_cluster$timepoint,
levels = c("Sham", "Day 14", "Day 35"))
# Create a column in metadata slot that holds the treatment and timepoint information.
seurat_cluster$comparison_var = ifelse(seurat_cluster$timepoint == "Sham",
"Sham",
paste(seurat_cluster$treatment,
seurat_cluster$timepoint,
sep = "_"))
# Factorize the comparison_var according to sequential order.
seurat_cluster$comparison_var <- factor(x = seurat_cluster$comparison_var,
levels = c("Sham",
"Bleomycin_Day 14",
"Bleomycin_Day 35"))
# Set default idents of Seurat object.
Idents(seurat_cluster) <- "cell_type"
# Remove unneeded object.
rm(seurat_annotated, metadata)
# Set colour scheme based on annotated cell type.
celltype_col = c("AT1" = "#2f4f4f",
"AT2" = "#00bfff",
"Basal resting" = "#2e8b57",
"Club (non-nasal)" = "#8b008b",
"Endothelial arterial" = "#ffa500",
"Endothelial capillary" = "#808000",
"Endothelial lympathic" = "#00008b",
"Endothelial venous" = "#00ff00",
"Fibroblasts" = "#ff4500",
"Immune" = "#f08080",
"Ionocyte" = "#f0e68c",
"Multiciliated (non-nasal)" = "#ff00ff",
"Myofibroblasts" = "#c71585",
"Neuroendocrine" = "#00fa9a",
"Pericytes" = "#ff1493",
"Suprabasal" = "#eee8aa",
"Transitional Club-AT2" = "#9370db",
"VSMCs" = "#00ffff")
# Visualize annotated cluster with a UMAP.
DimPlot(seurat_cluster,
reduction = "umap",
group.by = "cell_type",
cols = celltype_col,
label = TRUE,
label.size = 4,
repel = TRUE,
raster = FALSE) +
ggtitle("Final cell type annotation") +
theme(plot.title = element_text(hjust = 0.5))
save(seurat_cluster,
file = paste0(supp_file_path, "data/preprocessed_seurat_obj.RData"))
Analysis codes are adapted from HBCtraining and Ouyang Lab with credits going to the original authors of the publications cited in the book. Reference materials are adapted from SVI Bioinformatics and Cellular Genomics Lab with credits going to the original authors.